分组
1> 将数据拆分成组
df_data = {'team' : ['A', 'B', 'C', 'B', 'A', 'D', 'C'],
'rank' : [1, 2, 2, 3, 3, 4, 1],
'year' : [2018, 2019, 2020, 2019, 2018, 2020, 2021],
'points' : [87.5, 81, 86, 79.5, 76, 89, 75]
}
df = pd.DataFrame(df_data)
print(df.groupby('team'))
# 输出结果:
#
2> 查看分组
df_data = {'team' : ['A', 'B', 'C', 'B', 'A', 'D', 'C'],
'rank' : [1, 2, 2, 3, 3, 4, 1],
'year' : [2018, 2019, 2020, 2019, 2018, 2020, 2021],
'points' : [87.5, 81, 86, 79.5, 76, 89, 75]
}
df = pd.DataFrame(df_data)
print(f'按一列分组: \n{df.groupby("team").groups}')
# 输出结果:
# 按一列分组:
# {'A': Int64Index([0, 4], dtype='int64'), 'B': Int64Index([1, 3], dtype='int64'), 'C': Int64Index([2, 6], dtype='int64'), 'D': Int64Index([5], dtype='int64')}
print(f'按多列分组: \n{df.groupby(["team","year"]).groups}')
# 输出结果:
# 按多列分组:
# {('A', 2018): Int64Index([0, 4], dtype='int64'), ('B', 2019): Int64Index([1, 3], dtype='int64'), ('C', 2020): Int64Index([2], dtype='int64'), ('C', 2021): Int64Index([6], dtype='int64'), ('D', 2020): Int64Index([5], dtype='int64')}
3> 迭代遍历分组
df_data = {'team' : ['A', 'B', 'C', 'B', 'A', 'D', 'C'],
'rank' : [1, 2, 2, 3, 3, 4, 1],
'year' : [2018, 2019, 2020, 2019, 2018, 2020, 2021],
'points' : [87.5, 81, 86, 79.5, 76, 89, 75]
}
df = pd.DataFrame(df_data)
grouped = df.groupby('year')
for name,group in grouped:
print(name)
print(group)
# 输出结果:
# 2018
# team rank year points
# 0 A 1 2018 87.5
# 4 A 3 2018 76.0
# 2019
# team rank year points
# 1 B 2 2019 81.0
# 3 B 3 2019 79.5
# 2020
# team rank year points
# 2 C 2 2020 86.0
# 5 D 4 2020 89.0
# 2021
# team rank year points
# 6 C 1 2021 75.0
4> 选择一个分组
df_data = {'team' : ['A', 'B', 'C', 'B', 'A', 'D', 'C'],
'rank' : [1, 2, 2, 3, 3, 4, 1],
'year' : [2018, 2019, 2020, 2019, 2018, 2020, 2021],
'points' : [87.5, 81, 86, 79.5, 76, 89, 75]
}
df = pd.DataFrame(df_data)
grouped = df.groupby('year')
print(grouped.get_group(2019))
# 输出结果:
# team rank year points
# 1 B 2 2019 81.0
# 3 B 3 2019 79.5
5> 聚合
df_data = {'team' : ['A', 'B', 'C', 'B', 'A', 'D', 'C'],
'rank' : [1, 2, 2, 3, 3, 4, 1],
'year' : [2018, 2019, 2020, 2019, 2018, 2020, 2021],
'points' : [87.5, 81, 86, 79.5, 76, 89, 75]
}
df = pd.DataFrame(df_data)
grouped = df.groupby('year')
print(f'按 points 聚合:\n{grouped["points"].agg(np.mean)}')
# 输出结果:
# 按 points 聚合:
# year
# 2018 81.75
# 2019 80.25
# 2020 87.50
# 2021 75.00
# Name: points, dtype: float64
grouped1 = df.groupby('team')
print(f'按 team分组查看大小:\n{grouped.agg(np.size)}')
# 输出结果:
# 按 team分组查看大小:
# rank year points
# team
# A 2 2 2.0
# B 2 2 2.0
# C 2 2 2.0
# D 1 1 1.0
6> 使用多个聚合函数
df_data = {'team' : ['A', 'B', 'C', 'B', 'A', 'D', 'C'],
'rank' : [1, 2, 2, 3, 3, 4, 1],
'year' : [2018, 2019, 2020, 2019, 2018, 2020, 2021],
'points' : [87.5, 81, 86, 79.5, 76, 89, 75]
}
df = pd.DataFrame(df_data)
grouped = df.groupby('team')
agg = grouped['points'].agg([np.sum,np.mean,np.std])
print(agg)
# 输出结果:
# sum mean std
# team
# A 163.5 81.75 8.131728
# B 160.5 80.25 1.060660
# C 161.0 80.50 7.778175
# D 89.0 89.00 NaN
7> 转换
df_data = {'team' : ['A', 'B', 'C', 'B', 'A', 'D', 'C'],
'rank' : [1, 2, 2, 3, 3, 4, 1],
'year' : [2018, 2019, 2020, 2019, 2018, 2020, 2021],
'points' : [87.5, 81, 86, 79.5, 76, 89, 75]
}
df = pd.DataFrame(df_data)
grouped = df.groupby('team')
score = lambda x : (x - x.mean())/x.std()*10
print(grouped['points'].transform(score))
# 输出结果:
# 0 7.071068
# 1 7.071068
# 2 7.071068
# 3 -7.071068
# 4 -7.071068
# 5 NaN
# 6 -7.071068
# Name: points, dtype: float64
8> 过滤
df_data = {'team' : ['A', 'B', 'C', 'B', 'A', 'D', 'C'],
'rank' : [1, 2, 2, 3, 3, 4, 1],
'year' : [2018, 2019, 2020, 2019, 2018, 2020, 2021],
'points' : [87.5, 81, 86, 79.5, 76, 89, 75]
}
df = pd.DataFrame(df_data)
grouped = df.groupby('team')
filter = grouped.filter(lambda x:len(x) >= 2)
print(filter)
# 输出结果:
# team rank year points
# 0 A 1 2018 87.5
# 1 B 2 2019 81.0
# 2 C 2 2020 86.0
# 3 B 3 2019 79.5
# 4 A 3 2018 76.0
# 6 C 1 2021 75.0